Employee retention data is from "IBM HR Analytics Employee Attrition & Performance" from Kaggle:
https://www.kaggle.com/datasets/pavansubhasht/ibm-hr-analytics-attrition-dataset
Some of the data features include Education, Job Involvement and Satisfaction, Performance, Work life balance, etc...
See data source for more detail
I will understand what features correlate with employee attrition. This information can be used to help predict people who may be planning on leaving there position. Once identified, the company can have a discussion with the employee to understand if there are things within their job and/or career path that they would like to see imporoved. If at risk employees can be identified early, then this will help both the company and employees. The employee can improve their job and life satisfaction with proper intervention. Also, the company can retain their top talent and minimize hiring and training costs.
import altair as alt
import pandas as pd
#alt.renderers.enable('notebook')
alt.data_transformers.disable_max_rows() #Necessary because the violin plot required me to melt the DataFrame from wide to long
#from sklearn.preprocessing import OneHoteEncoder, OrdinalEncoder #Might be necessary with the oridinal data
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
df = pd.read_csv('EmployeeRetention.csv')
typeDict = {'Age':'int16',
'Attrition':'category',
'BusinessTravel':'category',
'DailyRate':'int32',
'Department':'category',
'DistanceFromHome':'int16',
'Education':'category',
'EducationField':'category',
'EmployeeCount':'int32',
'EmployeeNumber':'int32',
'EnvironmentSatisfaction':'category',
'Gender':'category',
'HourlyRate':'int32',
'JobInvolvement':'category',
'JobLevel':'category',
'JobRole':'category',
'JobSatisfaction':'category',
'MaritalStatus':'category',
'MonthlyIncome':'int32',
'MonthlyRate':'int32',
'NumCompaniesWorked':'int16',
'Over18':'bool',
'OverTime':'category',
'PercentSalaryHike':'int16',
'PerformanceRating':'category',
'RelationshipSatisfaction':'category',
'StandardHours':'int16',
'StockOptionLevel':'category',
'TotalWorkingYears':'int16',
'TrainingTimesLastYear':'int16',
'WorkLifeBalance':'category',
'YearsAtCompany':'int16',
'YearsInCurrentRole':'int16',
'YearsSinceLastPromotion':'int16',
'YearsWithCurrManager':'int16'
}
df = df.astype(typeDict)
## Change the features from integers to alphanumeric categories, prior to teaching a model, encorporate OrdinalEncoder
attrition = {'Yes': True,
'No': False}
Over18 = {'Y': True,
'N': False}
OverTime = {'Yes': True,
'No': False}
education = {1:'High School',
2:'College',
3:'Bachelor',
4:'Master',
5:'Doctor'}
environmentSatisfaction = {1:'Low',
2:'Medium',
3:'High',
4:'Very High'}
jobInvolvement = {1:'Low',
2:'Medium',
3:'High',
4:'Very High'}
jobSatisfaction = {1:'Low',
2:'Medium',
3:'High',
4:'Very High'}
performanceRating = {1:'Low',
2:'Good',
3:'Excellent',
4:'Outstanding'}
relationshipSatisfaction = {1:'Low',
2:'Medium',
3:'High',
4:'Very High'}
workLifeBalance = {1:'Bad',
2:'Good',
3:'Better',
4:'Best'}
## Defining the order of the ordinal features
businessOrder = ['Non-Travel','Travel_Rarely','Travel_Frequently']
educationOrder = ['High School','College','Bachelor','Master','Doctor']
ordinalOrder = ['Low','Medium','High','Very High']
performanceOrder = ['Low','Good','Excellent','Outstanding']
workOrder = ['Bad','Good','Better','Best']
jobOrder = [1,2,3,4,5]
stockOrder = [0,1,2,3]
##
df['Attrition'] = df['Attrition'].map(attrition)
df['Over18'] = df['Over18'].map(attrition)
df['OverTime'] = df['OverTime'].map(attrition)
df['Education'] = df['Education'].map(education)
df['EnvironmentSatisfaction'] = df['EnvironmentSatisfaction'].map(environmentSatisfaction)
df['JobInvolvement'] = df['JobInvolvement'].map(jobInvolvement)
df['JobSatisfaction'] = df['JobSatisfaction'].map(jobSatisfaction)
df['PerformanceRating'] = df['PerformanceRating'].map(performanceRating)
df['RelationshipSatisfaction'] = df['RelationshipSatisfaction'].map(relationshipSatisfaction)
df['WorkLifeBalance'] = df['WorkLifeBalance'].map(workLifeBalance)
##
df['Attrition'] = df['Attrition'].astype('bool')
df['Over18'] = df['Over18'].astype('bool')
df['OverTime'] = df['OverTime'].astype('bool')
df['BusinessTravel'] = df['BusinessTravel'].astype(pd.api.types.CategoricalDtype(categories = businessOrder, ordered = True))
df['Education'] = df['Education'].astype(pd.api.types.CategoricalDtype(categories = educationOrder, ordered = True))
df['EnvironmentSatisfaction'] = df['EnvironmentSatisfaction'].astype(pd.api.types.CategoricalDtype(categories = ordinalOrder, ordered = True))
df['JobInvolvement'] = df['JobInvolvement'].astype(pd.api.types.CategoricalDtype(categories = ordinalOrder, ordered = True))
df['JobSatisfaction'] = df['JobSatisfaction'].astype(pd.api.types.CategoricalDtype(categories = ordinalOrder, ordered = True))
df['PerformanceRating'] = df['PerformanceRating'].astype(pd.api.types.CategoricalDtype(categories = performanceOrder, ordered = True))
df['RelationshipSatisfaction'] = df['RelationshipSatisfaction'].astype(pd.api.types.CategoricalDtype(categories = ordinalOrder, ordered = True))
df['WorkLifeBalance'] = df['WorkLifeBalance'].astype(pd.api.types.CategoricalDtype(categories = workOrder, ordered = True))
df['JobLevel'] = df['JobLevel'].astype(pd.api.types.CategoricalDtype(categories = jobOrder, ordered = True))
df['StockOptionLevel'] = df['StockOptionLevel'].astype(pd.api.types.CategoricalDtype(categories = stockOrder, ordered = True))
def ordinalFun(var,val):
"""Assigns an order value based on two inputs. This is used on a dataframe to add an order column for future graphs.
Args:
var (string): This value is the key to the dictionary that has the order as the value
val (variant): This value is used to find what index it is in the list. The list of the dictionary will be in order. The index will be returned
Returns:
variant: ordinal position if in ordinal dictioary, value if not in ordinal dictionary
"""
lookupDict = {'Over18':[False,True],
'OverTime':[False,True],
'BusinessTravel':businessOrder,
'Education':educationOrder,
'EnvironmentSatisfaction':ordinalOrder,
'JobInvolvement':ordinalOrder,
'JobSatisfaction':ordinalOrder,
'PerformanceRating':performanceOrder,
'RelationshipSatisfaction':ordinalOrder,
'WorkLifeBalance':workOrder,
'JobLevel':jobOrder,
'StockOptionLevel':stockOrder}
try:
orderList = lookupDict[var]
for i in range(len(orderList)):
if orderList[i] == val:
orderNumber = i
if var=="JobLevel":
return (str(i+1) + " - " + str(val))
else:
return (str(i) + " - " + str(val))
except:
orderNumber = val
return orderNumber
df.describe(include = 'all')
| Age | Attrition | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EmployeeCount | EmployeeNumber | EnvironmentSatisfaction | Gender | HourlyRate | JobInvolvement | JobLevel | JobRole | JobSatisfaction | MaritalStatus | MonthlyIncome | MonthlyRate | NumCompaniesWorked | Over18 | OverTime | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | StandardHours | StockOptionLevel | TotalWorkingYears | TrainingTimesLastYear | WorkLifeBalance | YearsAtCompany | YearsInCurrentRole | YearsSinceLastPromotion | YearsWithCurrManager | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1470.000000 | 1470 | 1470 | 1470.000000 | 1470 | 1470.000000 | 1470 | 1470 | 1470.0 | 1470.000000 | 1470 | 1470 | 1470.000000 | 1470 | 1470.0 | 1470 | 1470 | 1470 | 1470.000000 | 1470.000000 | 1470.000000 | 1470 | 1470 | 1470.000000 | 1470 | 1470 | 1470.0 | 1470.0 | 1470.000000 | 1470.000000 | 1470 | 1470.000000 | 1470.000000 | 1470.000000 | 1470.000000 |
| unique | NaN | 2 | 3 | NaN | 3 | NaN | 5 | 6 | NaN | NaN | 4 | 2 | NaN | 4 | 5.0 | 9 | 4 | 3 | NaN | NaN | NaN | 1 | 2 | NaN | 2 | 4 | NaN | 4.0 | NaN | NaN | 4 | NaN | NaN | NaN | NaN |
| top | NaN | False | Travel_Rarely | NaN | Research & Development | NaN | Bachelor | Life Sciences | NaN | NaN | High | Male | NaN | High | 1.0 | Sales Executive | Very High | Married | NaN | NaN | NaN | True | False | NaN | Excellent | High | NaN | 0.0 | NaN | NaN | Better | NaN | NaN | NaN | NaN |
| freq | NaN | 1233 | 1043 | NaN | 961 | NaN | 572 | 606 | NaN | NaN | 453 | 882 | NaN | 868 | 543.0 | 326 | 459 | 673 | NaN | NaN | NaN | 1470 | 1054 | NaN | 1244 | 459 | NaN | 631.0 | NaN | NaN | 893 | NaN | NaN | NaN | NaN |
| mean | 36.923810 | NaN | NaN | 802.485714 | NaN | 9.192517 | NaN | NaN | 1.0 | 1024.865306 | NaN | NaN | 65.891156 | NaN | NaN | NaN | NaN | NaN | 6502.931293 | 14313.103401 | 2.693197 | NaN | NaN | 15.209524 | NaN | NaN | 80.0 | NaN | 11.279592 | 2.799320 | NaN | 7.008163 | 4.229252 | 2.187755 | 4.123129 |
| std | 9.135373 | NaN | NaN | 403.509100 | NaN | 8.106864 | NaN | NaN | 0.0 | 602.024335 | NaN | NaN | 20.329428 | NaN | NaN | NaN | NaN | NaN | 4707.956783 | 7117.786044 | 2.498009 | NaN | NaN | 3.659938 | NaN | NaN | 0.0 | NaN | 7.780782 | 1.289271 | NaN | 6.126525 | 3.623137 | 3.222430 | 3.568136 |
| min | 18.000000 | NaN | NaN | 102.000000 | NaN | 1.000000 | NaN | NaN | 1.0 | 1.000000 | NaN | NaN | 30.000000 | NaN | NaN | NaN | NaN | NaN | 1009.000000 | 2094.000000 | 0.000000 | NaN | NaN | 11.000000 | NaN | NaN | 80.0 | NaN | 0.000000 | 0.000000 | NaN | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 30.000000 | NaN | NaN | 465.000000 | NaN | 2.000000 | NaN | NaN | 1.0 | 491.250000 | NaN | NaN | 48.000000 | NaN | NaN | NaN | NaN | NaN | 2911.000000 | 8047.000000 | 1.000000 | NaN | NaN | 12.000000 | NaN | NaN | 80.0 | NaN | 6.000000 | 2.000000 | NaN | 3.000000 | 2.000000 | 0.000000 | 2.000000 |
| 50% | 36.000000 | NaN | NaN | 802.000000 | NaN | 7.000000 | NaN | NaN | 1.0 | 1020.500000 | NaN | NaN | 66.000000 | NaN | NaN | NaN | NaN | NaN | 4919.000000 | 14235.500000 | 2.000000 | NaN | NaN | 14.000000 | NaN | NaN | 80.0 | NaN | 10.000000 | 3.000000 | NaN | 5.000000 | 3.000000 | 1.000000 | 3.000000 |
| 75% | 43.000000 | NaN | NaN | 1157.000000 | NaN | 14.000000 | NaN | NaN | 1.0 | 1555.750000 | NaN | NaN | 83.750000 | NaN | NaN | NaN | NaN | NaN | 8379.000000 | 20461.500000 | 4.000000 | NaN | NaN | 18.000000 | NaN | NaN | 80.0 | NaN | 15.000000 | 3.000000 | NaN | 9.000000 | 7.000000 | 3.000000 | 7.000000 |
| max | 60.000000 | NaN | NaN | 1499.000000 | NaN | 29.000000 | NaN | NaN | 1.0 | 2068.000000 | NaN | NaN | 100.000000 | NaN | NaN | NaN | NaN | NaN | 19999.000000 | 26999.000000 | 9.000000 | NaN | NaN | 25.000000 | NaN | NaN | 80.0 | NaN | 40.000000 | 6.000000 | NaN | 40.000000 | 18.000000 | 15.000000 | 17.000000 |
#Determining balance of labels - Unbalanced: 16%==True
df.value_counts('Attrition')
Attrition False 1233 True 237 dtype: int64
Things to note:
del df['EmployeeCount'] #All values were equal to 1
del df['StandardHours'] #All values were equal to 80
continuous_field = list(df.select_dtypes(include='number').columns)
ordinal_field = ['Over18','OverTime','Education','EnvironmentSatisfaction','JobInvolvement','JobSatisfaction','PerformanceRating','RelationshipSatisfaction','WorkLifeBalance','JobLevel','StockOptionLevel','BusinessTravel']
categorical_field = list(set(df.columns)-set(continuous_field)-set(['Attrition'])-set(ordinal_field))
## Defining Categorical Data Frame
categoricalDF = df[list(set(categorical_field)|set(["Attrition"]))]
categoricalDF = pd.melt(categoricalDF,id_vars = ['Attrition'], value_vars = list(set(categoricalDF.columns)-set(['Attrition'])))
## Defining Ordinal Data Frame
ordinalDF = df[list(set(ordinal_field)|set(["Attrition"]))]
ordinalDF = pd.melt(ordinalDF,id_vars = ['Attrition'], value_vars = list(set(ordinalDF.columns)-set(['Attrition'])))
ordinalDF['order'] = ordinalDF.apply(lambda row: ordinalFun(row.variable, row.value), axis = 1) # Adding order to the dataframe for future graphs
## Defining Continuous Data Frame
continuousDF = df[list(set(continuous_field)|set(["Attrition"]))]
continuousDF = pd.melt(continuousDF,id_vars = ['Attrition'], value_vars = list(set(continuousDF.columns)-set(['Attrition'])))
The purpose of exploratory data analysis is to gain some insights in the data. For this data set, I'm going to break it down into categorical, ordinal, and continuous features. Continuous features include, but not limited to, Monthly Income, Total Working Years, Distance From Home and other quantitiative data. Categorial features include, but not limited to, Performance Rating, Education, and Job Satisfaction.
The tasks that I need to accomplish in this sections:
Violin Plot Progress:
Made the decision between swarm and violin plot by the features of altair appearing to only have jitter capability. The swarm plot would bring all of the points towards the center axis; thus, making it easier to compare two distributions. I chose not to use seaborn because I liked altairs selection interaction and a violin plot would achieve the same tasks.
The first iteration of the violin plot was a long display with a different plot for each continuous variable. This provided enough information to complete the task of understanding the distribution of the values grouped by the Attrition label. The purpose of this was to understand if there was a different distribution of a values depending on the attrition label.
To improve the initial design of the visualization, I deceided to create a drop down menu to make the display more compact. This could remove the users ability to compare two different variables distributions because they would not be able to see the two plots at the same time. To mitigate this, I could plot the two side by side. I'll ask some people for their input
Future work could be done to make the title and axis label change dynamically with the selection. I would also like to have this chart superimposed over a swarm plot.
Scatter Plot Progress:
Stacked Bar Chart Progress:
## VIOLIN PLOT - All in one long plot
for i in range(len(continuous_field)):
yAxis = continuous_field[i]
violin_chart = alt.Chart(df).transform_density(
yAxis,
as_=[yAxis, 'density'],
counts = True,
groupby = ['Attrition'],
#maxsteps = 1000
).mark_area(orient='horizontal').encode(
y=str(yAxis+":Q"),
color='Attrition:N',
x=alt.X(
'density:Q',
stack = 'center',
impute = None,
title = None,
axis = alt.Axis(labels = False, values = [0],grid = False, ticks = False),
),
column=alt.Column(
'Attrition:N',
header=alt.Header(
titleOrient = 'bottom',
labelOrient = 'top',
labelPadding = 0,
),
)
).configure_title(
align = 'center'
).properties(
title = yAxis + " vs. Attrition Rates",
width = 150
).configure_facet(
spacing = 0
).configure_view(
stroke = None
).interactive()
violin_chart.display()
## 1 - Ordinal data not in order
## 2 - Can I get ordinal data and categorical data using the appropriate color schemes (Shade varying for ordinal, rainbow for categorial)
dropDownList = list(categoricalDF.variable.unique())
dropDownList.sort()
selection = alt.selection_single(name = "Categorial", fields = ['variable'], bind = alt.binding_select(options=dropDownList), init = {'variable':"Department"})
stackedBarCat = alt.Chart(categoricalDF
).mark_bar(
#).add_selection(selection
#).transform_filter(selection
).encode(
y = alt.Y('count(Attrition)',stack='normalize'),
x = alt.X('Attrition', axis = alt.Axis(labelAngle=0)),
color = alt.Color('value:N',legend=alt.Legend(title = "Categorical Variable", orient='right')),
#Megan, get the color to switch with the order moving to descending. Also, find a way to support ordinal and categorial. Might be two different graphs
tooltip = ['variable','value']
).properties(width = 150, height = 250, title = 'Categorical Variables'
).configure_legend(labelLimit=150,labelPadding=100
).add_selection(selection
).transform_filter(selection
)
stackedBarCat.display()
## 1 - Ordinal data not in order
## 2 - Can I get ordinal data and categorical data using the appropriate color schemes (Shade varying for ordinal, rainbow for categorial)
dropDownList = list(ordinalDF.variable.unique())
dropDownList.sort()
selection = alt.selection_single(name = "Ordinal", fields = ['variable'], bind = alt.binding_select(options=dropDownList), init = {'variable':"BusinessTravel"})
stackedBarCat = alt.Chart(ordinalDF
).mark_bar(
#).add_selection(selection
#).transform_filter(selection
).encode(
y = alt.Y('count(Attrition)',stack='normalize'),
x = alt.X('Attrition', axis = alt.Axis(labelAngle=0)),
color = alt.Color('order:O',legend=alt.Legend(title = "Ordinal Variable", orient='right')),
#Megan, get the color to switch with the order moving to descending. Also, find a way to support ordinal and categorial. Might be two different graphs
tooltip = ['variable','value']
).properties(width = 150, height = 250, title = 'Ordinal Variables'
).configure_legend(labelLimit=150,labelPadding=100
).add_selection(selection
).transform_filter(selection
)
stackedBarCat.display()
['JobInvolvement' 'JobSatisfaction' 'RelationshipSatisfaction' 'WorkLifeBalance' 'BusinessTravel' 'PerformanceRating' 'Education' 'JobLevel' 'Over18' 'OverTime' 'EnvironmentSatisfaction' 'StockOptionLevel'] ['Department' 'MaritalStatus' 'JobRole' 'EducationField' 'Gender']